home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Shareware Grab Bag
/
Shareware Grab Bag.iso
/
002
/
texttool.arc
/
INDEX.C
next >
Wrap
C/C++ Source or Header
|
1987-02-11
|
8KB
|
256 lines
/* index.c
a program to prepare an ordered list of "words" and their frequencies.
a "word" is defined as a string of alphabetic characters (A-Z, a-z)
and apostrophes; digits and all other characters end a word.
words are folded to lower case before they are counted. */
#include <stdio.h>
#include <ctype.h>
/* Boolean values; main() uses them for its inword flag. */
#define YES 1
#define NO 0
/* Bound on the number of characters main() collects for one word. */
#define WORDLEN 20
/* Character classes returned by accept(). */
#define LETTER 0
#define NUMBER 1
#define OTHER 2
/* Size in bytes of each word-storage chunk allocated by countword(). */
#define KILOBYTE 1024
/* Shared scratch buffer: main() reads input blocks into it, and main()
   and treeprint() format output lines in it. */
char buffer[BUFSIZ];
/* One node of the binary search tree of words built by countword(). */
struct list
{
    char *wd;            /* text of the unique word held by this node */
    int count;           /* number of occurrences of the word */
    struct list *left;   /* subtree of words that compare lower */
    struct list *right;  /* subtree of words that compare greater */
};
typedef struct list LIST;       /* a tree node */
typedef struct list *LISTPTR;   /* a pointer to a tree node */
/* Not referenced by any code visible in this file. */
int nwords;
/* Number of distinct words; countword() adds one for every node it
   creates, and main() writes the total at the top of the output file. */
int n_unique_words;
/* make_out_name: derive the output file name from the input file name.
   The extension of IN (the text after the last '.' of its final path
   component) is replaced by "txt"; a name without an extension gets
   ".txt" appended.  A '.' that starts the final component is treated as
   part of the name, not as an extension.
   in    NUL-terminated input file name
   out   receives the NUL-terminated output file name
   size  number of bytes available in out
   returns 1 on success, 0 when the result does not fit in out */
static int make_out_name (in, out, size)
char *in;
char *out;
int size;
{
    int len;    /* index into in; its length once the scan is done */
    int base;   /* index where the final path component starts */
    int dot;    /* index of the extension's '.', or -1 if none */
    int i;

    base = 0;
    dot = -1;
    for (len = 0; in[len] != '\0'; ++len)
    {
        if (in[len] == '/' || in[len] == '\\' || in[len] == ':')
        {
            base = len + 1;     /* a '.' in a directory name is no extension */
            dot = -1;
        }
        else if (in[len] == '.' && len > base)
            dot = len;
    }
    if (dot < 0)
        dot = len;
    if (dot + 5 > size)         /* stem + ".txt" + terminating NUL */
        return (0);
    for (i = 0; i < dot; ++i)
        out[i] = in[i];
    out[dot] = '.';
    out[dot + 1] = 't';
    out[dot + 2] = 'x';
    out[dot + 3] = 't';
    out[dot + 4] = '\0';
    return (1);
}

/* main: index the words of a text file.
   The last file named on the command line is read; every word is folded
   to lower case and entered in a binary tree by countword().  The count
   of distinct words followed by one "count word" line per word, in tree
   order, is written to a file whose name is the input name with its
   extension replaced by "txt".  Output lines end in CR LF.
   returns 0 on success, 1 on any error */
int main (argc, argv) /* index.c */
int argc;
char *argv[];
{
    LISTPTR root;
    LISTPTR countword();
    int accept(), validword(), treeprint(), strcmp();
    char word[WORDLEN + 1];         /* WORDLEN characters plus the NUL */
    char out_file[FILENAME_MAX];
    char *in_file;
    int c,          /* individual character */
        j,          /* counter for word - output */
        i,          /* counter for buffer - input */
        nchar,      /* number of characters read from text file */
        inword,     /* flag - indicates whether we are in a word */
        status;     /* exit status */
    FILE *fp;
    FILE *fpout;

    /* search the command line for a file to read: exit if none */
    if (argc < 2)
    {
        printf ("no file to index");
        return (1);
    }
    /* Only one file is indexed.  When several are named, the last one is
       used, which is the file the output name was always derived from. */
    in_file = argv[argc - 1];
    if (argc > 2)
        fprintf (stderr, "index: only the last file named (%s) is indexed\n",
                 in_file);

    if (!make_out_name (in_file, out_file, (int) sizeof out_file))
    {
        fprintf (stderr, "index: file name too long: %s\n", in_file);
        return (1);
    }
    /* opening the output with "wb" would truncate the input */
    if (strcmp (in_file, out_file) == 0)
    {
        fprintf (stderr, "index: output file %s would overwrite the input\n",
                 out_file);
        return (1);
    }

    if ((fp = fopen (in_file, "r")) == NULL)
    {
        printf ("index: can't open %s\n", in_file);
        return (1);
    }
    printf ("OUTPUT file: %s\n", out_file);
    fpout = fopen (out_file, "wb");     /* open output file */
    if (fpout == NULL)
    {
        printf ("index: cannot open output file");
        fclose (fp);
        return (1);
    }
    /* debug trace kept from the original; %p is the pointer conversion */
    printf ("fpout = %p\n", (void *) fpout);

    inword = NO;
    root = NULL;
    n_unique_words = j = 0;
    /* process until no characters are returned (EOF or read error) */
    while ((nchar = (int) fread (buffer, 1, BUFSIZ, fp)) > 0)
    {
        for (i = 0; i < nchar; ++i)
        {
            /* tolower() needs a value in the unsigned char range */
            c = (unsigned char) buffer[i];
            if (accept (c) == LETTER)
            {
                /* characters beyond WORDLEN are dropped, not stored */
                if (j < WORDLEN)
                    word[j++] = tolower (c);
                inword = YES;
            }
            else if (inword == YES)
            {
                word[j] = '\0';
                if (validword (word))
                    root = countword (root, word, j + 1);
                inword = NO;
                j = 0;
            }
        }
    }
    /* A word that runs up to end of file is flushed here, after the loop,
       because end of file may only be seen by a read that returns 0. */
    if (inword == YES)
    {
        word[j] = '\0';
        if (validword (word))
            root = countword (root, word, j + 1);
    }

    status = 0;
    if (ferror (fp))
    {
        fprintf (stderr, "index: error reading %s\n", in_file);
        status = 1;
    }
    fclose (fp);

    printf ("completed reading file -- output ?follows?\n");
    fprintf (fpout, "total words %4d\r\n", n_unique_words); /* write total */
    treeprint (root, fpout);                    /* call recursive print */

    /* fclose() flushes, so a failure there is a write error too */
    if (ferror (fpout))
        status = 1;
    if (fclose (fpout) != 0)
        status = 1;
    if (status != 0)
        fprintf (stderr, "index: finished with errors, %s may be incomplete\n",
                 out_file);
    return (status);
} /* end of program */
/* RECURSIVE routine to build a tree of words and their frequencies.
   countword: install WORD in the tree rooted at P, or add one to its
   count when it is already there.
   p       root of the (sub)tree to search; NULL for an empty tree
   word    NUL-terminated word to install
   wd_len  length of word including the terminating NUL
   returns the root of the (sub)tree: P itself when P is not NULL,
           otherwise the new node, or NULL when memory ran out (the word
           is then dropped and a message goes to stderr).
   Word text is stored in chunks of at least KILOBYTE bytes that are
   shared by many nodes; neither chunks nor nodes are freed here. */
struct list *countword (p, word, wd_len) /*install word at or below p */
struct list *p; /*count word if already in tree*/
char word[];
int wd_len; /* including the terminating NUL */
{
    static char *wd_buffer = NULL,      /* next free byte of current chunk */
                *wd_buffer_top = NULL;  /* one past the end of the chunk */
    char *malloc();
    char *chunk;
    struct list *node;
    int cond;
    int need;       /* bytes of word storage this word requires */
    int n;

    if (p == NULL)  /* new word - make a new node */
    {
        need = (wd_len < 1) ? 1 : wd_len;   /* room for the NUL at least */

        /* make sure there is room to store the word; if not, start a new
           chunk (what is left of the old one is abandoned) */
        if (wd_buffer == NULL || (wd_buffer_top - wd_buffer) < need)
        {
            n = (need > KILOBYTE) ? need : KILOBYTE;
            /* sizeof makes the argument a size_t, which matters because
               malloc is declared here without a prototype */
            chunk = malloc (n * sizeof(char));
            if (chunk == NULL)
            {
                fprintf (stderr, "index: out of memory, \"%s\" dropped\n",
                         word);
                return (NULL);
            }
            wd_buffer = chunk;
            wd_buffer_top = chunk + n;  /* top of chunk (+1) */
        }

        node = (LISTPTR) malloc (sizeof(LIST));     /* allocate a node */
        if (node == NULL)
        {
            fprintf (stderr, "index: out of memory, \"%s\" dropped\n", word);
            return (NULL);
        }

        n_unique_words += 1;        /* count the new word */
        node->wd = wd_buffer;       /* pointer to the new word */
        /* copy the word to its home, never more than the space reserved */
        for (n = 1; n < need && *word != '\0'; ++n)
            *wd_buffer++ = *word++;
        *wd_buffer++ = '\0';
        /* set outside the copy loop so that it is done for every word,
           an empty one included */
        node->count = 1;
        node->left = node->right = NULL;    /* null the child pointers */
        return (node);
    }

    if ((cond = strcmp (word, p->wd)) == 0)
        p->count++;                 /* repeated word */
    else if (cond < 0)              /* lower, go left */
        p->left = countword (p->left, word, wd_len);
    else                            /* greater, go right */
        p->right = countword (p->right, word, wd_len);
    return (p);
}
/* strcmp: compare the NUL-terminated strings s and t.
   returns 0 when they are equal, otherwise the difference between the
   first pair of characters that differ (negative when s sorts first). */
int strcmp (s, t)
char s[];
char t[];
{
    /* walk both strings while they agree */
    for ( ; *s == *t; ++s, ++t)
    {
        if (*s == '\0')
            return (0);     /* reached the end together: equal */
    }
    return (*s - *t);
}
/* treeprint: write the tree rooted at p to fpout, left subtree first,
   then the node, then the right subtree.  Each node becomes one line
   holding its count and its word, terminated by CR LF.  The line is
   formatted in the global scratch buffer.
   returns 0; callers do not use the value. */
int treeprint(p,fpout)
struct list *p;
FILE *fpout;
{
    struct list *node;

    /* the left subtree is handled by recursion, the right one by
       stepping to it */
    for (node = p; node != NULL; node = node->right)
    {
        treeprint(node->left, fpout);
        sprintf(buffer, "%4d %s", node->count, node->wd);
        strcat(buffer, "\r\n");
        fwrite(buffer, 1, strlen(buffer), fpout);
    }
    return (0);
}
/* accept: classify a character for the word scanner.
   c  the character to classify
   returns LETTER for an alphabetic character or an apostrophe,
           NUMBER for a decimal digit, OTHER for anything else. */
int accept (c)
char c;
{
    /* isalpha() and isdigit() are only defined for values in the
       unsigned char range (or EOF); a plain char may be negative */
    int ch = (unsigned char) c;

    if (isalpha(ch) || ch == '\'') return (LETTER);
    if (isdigit(ch)) return (NUMBER);
    return (OTHER);
}
/* validword: decide whether a collected word may be counted.
   w  NUL-terminated word
   returns 0 when the word starts with a hyphen or contains two
           apostrophes in a row, 1 otherwise.
   The whole string is examined, up to its terminator.  Doubled embedded
   hyphens are not rejected. */
int validword (w)
char w[];
{
    int i;

    if (w[0] == '-') return (0);    /* initial hyphens are invalid */
    for (i = 0; w[i] != '\0'; ++i)
    {
        /* w[i+1] is at worst the terminator, so the look-ahead is safe */
        if (w[i] == '\'' && w[i+1] == '\'') return (0);
                                    /* double apostrophe is invalid */
    }
    return (1);
}